import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


"""
1
    创建爬虫文件的命令（cmd命令）
    scrapy genspider -t crawl 爬虫文件名 爬取网站的域名
2
"""


class ReadbookspiderSpider(CrawlSpider):
    """Crawl spider for www.dushu.com book listing pages.

    Crawling begins at ``/book/1188.html``. A single rule extracts every
    link whose URL matches ``/book/1188_<digits>.html``, hands the
    resulting response to :meth:`parse_item`, and keeps following links
    found on those pages.

    NOTE(review): the start URL itself does not match the rule's pattern,
    so it is presumably used only as a source of links -- confirm.
    """

    name = "readbookspider"
    allowed_domains = ["www.dushu.com"]
    start_urls = ["https://www.dushu.com/book/1188.html"]

    # ``allow`` takes the regular expression that decides which extracted
    # links are crawled and parsed.
    rules = (
        Rule(
            link_extractor=LinkExtractor(allow=r"/book/1188_\d+\.html"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build the item for one page matched by the crawl rule.

        Args:
            response: The response object for the matched page.

        Returns:
            An empty dict; no fields are extracted from ``response`` yet.
        """
        # TODO: populate the item from ``response``, for example with
        # ``response.xpath(...).get()`` lookups for each field.
        return {}
